# -*- coding: utf-8 -*-

import os
class UrlDataFromBaidu:
    """Labelled URL samples loaded from a directory of per-category files.

    Every entry of ``dir`` is opened as a text file, and the entry's name is
    used as the label for all samples read from it. Each line is split on
    ``;``: the first field is the URL and the second field, when present, is
    the title.

    Attributes:
        X: List of ``[url, title]`` pairs, one per kept line.
        Y: Label (source file name) for the entry of ``X`` at the same index.
        dir: Directory scanned by ``__init__``; a relative value is resolved
            against the current working directory.
    """

    # Class-level placeholders kept so ``UrlDataFromBaidu.X`` / ``.Y`` remain
    # valid names; ``__init__`` rebinds fresh per-instance lists.
    X = []
    Y = []

    dir = "Data/BaiduURLs"

    def __init__(self):
        """Read every file under ``self.dir`` into ``self.X`` and ``self.Y``.

        Lines with an empty URL field, and lines whose URL starts with
        ``http://www.baidu.com/link``, are skipped.

        Raises:
            OSError: If the directory cannot be listed or one of its entries
                cannot be opened as a file.
        """
        # Fresh lists per instance: appending to the class-level lists would
        # make every instance share (and keep accumulating) the same samples.
        self.X = []
        self.Y = []

        dirs = os.listdir(self.dir)
        print("Number of Catograies:", len(dirs))
        for d in dirs:
            # The context manager closes each file even if reading fails.
            with open(os.path.join(self.dir, d)) as f:
                for line in f:
                    s = line.rstrip("\n").split(";")
                    # str.split always yields at least one field.
                    u = s[0]
                    if not u:
                        continue
                    if u.startswith("http://www.baidu.com/link"):
                        continue
                    title = s[1] if len(s) > 1 else ""
                    self.X.append([u, title])
                    self.Y.append(d)

        print("Number of X:", len(self.X))
        print("Number of Y:", len(self.Y))

    def convert_to_two_level(self):
        """Shorten three-part labels in ``self.Y`` to their first two parts.

        A label that splits on ``.`` into exactly three parts is replaced by
        its first two parts joined with ``.``; every other label is kept
        unchanged. ``self.Y`` is rebound to the new list.

        Returns:
            This instance, so the call can be chained.
        """
        two_level = []
        for label in self.Y:
            parts = label.split(".")
            if len(parts) == 3:
                two_level.append(".".join(parts[:2]))
            else:
                two_level.append(label)
        self.Y = two_level
        # Internal invariant: samples and labels must stay aligned.
        assert len(self.X) == len(self.Y)
        return self

if __name__ == "__main__":
    # Script entry point: move up one directory before loading (presumably so
    # the relative data directory resolves from the project root - confirm),
    # then print the labels after shortening them to two levels.
    # ``os`` is already imported at the top of the module.
    os.chdir("..")
    dataset = UrlDataFromBaidu()
    dataset.convert_to_two_level()
    print(dataset.Y)